# Add Libraries
library(readxl)
library(skimr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(dplyr)
library(flextable)
##
## Attaching package: 'flextable'
##
## The following object is masked from 'package:purrr':
##
## compose
library(lubridate)
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(RColorBrewer)
# Custom ggplot2 theme applied to every plot in this report: minimal base
# with centred titles, black axis lines/ticks, and consistent text sizes.
# Fixes: removed the space before "(" and the trailing comma after the last
# theme() argument (an empty trailing argument is fragile and non-idiomatic).
theme_custom <- theme_minimal() +
  theme(
    plot.title = element_text(size = 14, hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5),
    strip.text = element_text(size = 10),
    axis.text = element_text(size = 8),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    axis.line = element_line(color = "black"),
    axis.ticks = element_line(color = "black")
  )
# Colour palettes: two 8-colour qualitative ColorBrewer sets, plus their
# concatenation for plots that need up to 16 distinct fill colours.
palette1 <- brewer.pal(n = 8, name = "Set2")
palette2 <- brewer.pal(n = 8, name = "Set3")
merged_palette <- c(palette1, palette2)
# Read the raw sales spreadsheet and convert categorical columns to factors.
# IDs arrive as numbers, so they get an explicit second across() conversion.
data_sales <- read_excel("data/raw/Coffee_Shop_Sales.xlsx") %>%
mutate(across(where(is.character), as.factor),
across(c(transaction_id, store_id, product_id), as.factor)) %>%
mutate(
# Combine the date part of transaction_date with the clock time of
# transaction_time into one POSIXct timestamp for time-based analysis later.
transaction_date_and_time = as.POSIXct(paste(as.Date(transaction_date), hms::as_hms(transaction_time)), format = "%Y-%m-%d %H:%M:%S")
) %>%
relocate(transaction_date_and_time, .after = transaction_time)
# Basic descriptive statistics for every column
skim(data_sales)
| Name | data_sales |
| Number of rows | 149116 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| factor | 7 |
| numeric | 2 |
| POSIXct | 3 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| transaction_id | 0 | 1 | FALSE | 149116 | 1: 1, 2: 1, 3: 1, 4: 1 |
| store_id | 0 | 1 | FALSE | 3 | 8: 50735, 3: 50599, 5: 47782 |
| store_location | 0 | 1 | FALSE | 3 | Hel: 50735, Ast: 50599, Low: 47782 |
| product_id | 0 | 1 | FALSE | 80 | 71: 3076, 50: 3053, 59: 3029, 54: 3026 |
| product_category | 0 | 1 | FALSE | 9 | Cof: 58416, Tea: 45449, Bak: 22796, Dri: 11468 |
| product_type | 0 | 1 | FALSE | 29 | Bre: 17183, Gou: 16912, Bar: 16403, Hot: 11468 |
| product_detail | 0 | 1 | FALSE | 80 | Cho: 3076, Ear: 3053, Dar: 3029, Mor: 3026 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| transaction_qty | 0 | 1 | 1.44 | 0.54 | 1.0 | 1.0 | 1 | 2.00 | 8 | ▇▁▁▁▁ |
| unit_price | 0 | 1 | 3.38 | 2.66 | 0.8 | 2.5 | 3 | 3.75 | 45 | ▇▁▁▁▁ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| transaction_date | 0 | 1 | 2023-01-01 00:00:00 | 2023-06-30 00:00:00 | 2023-04-24 00:00:00 | 181 |
| transaction_time | 0 | 1 | 1899-12-31 06:00:00 | 1899-12-31 20:59:32 | 1899-12-31 11:15:28 | 25762 |
| transaction_date_and_time | 0 | 1 | 2023-01-01 06:06:11 | 2023-06-30 18:57:19 | 2023-04-24 06:24:32 | 116129 |
# Count missing values per column (expected to be all zeros for this dataset)
missing_data <- data_sales %>%
  summarise(across(everything(), function(col) sum(is.na(col))))
# Extract fully duplicated rows; duplicated() flags the second and later
# occurrences of identical rows, so an empty result means no duplicates
duplicates <- data_sales[duplicated(data_sales), ]
# Flag outliers in a numeric vector using the three-sigma rule.
# Returns a logical vector aligned with `column`: TRUE where the value lies
# more than three standard deviations from the mean (NA stays NA).
detect_outliers_by_product <- function(column) {
  centre <- mean(column, na.rm = TRUE)
  spread <- sd(column, na.rm = TRUE)
  # Equivalent to (x < mean - 3*sd) | (x > mean + 3*sd)
  abs(column - centre) > 3 * spread
}
# Flag unit_price outliers within each product, then keep only the products
# where at least one transaction was flagged. Per-verb `.by` grouping
# (dplyr >= 1.1) returns an ungrouped result, so no ungroup() is needed.
outliers_by_product_price <- data_sales %>%
  mutate(price_outliers = detect_outliers_by_product(unit_price),
         .by = product_id) %>%
  filter(any(price_outliers), .by = product_id)
# Visualise the price spread per product: one boxplot of unit_price per
# product ID, flipped so the many product IDs are readable on the y axis.
ggplot(outliers_by_product_price,
       aes(x = as.factor(product_id), y = unit_price)) +
  geom_boxplot(width = 0.9) +
  coord_flip() +
  labs(title = "Boxplot of Unit Price by Product ID",
       x = "Product ID",
       y = "Unit Price") +
  theme_custom
Upon analysis, it was determined that outlier detection for transaction_qty is unnecessary, as the observed range (1–8) reflects typical and reasonable purchase quantities.
Similarly, no outliers were removed for unit_price, as all values, including price reductions of up to 50% (price_id = 9), are consistent with pricing strategies such as discounts or promotions.
Given the acceptability of the data, no further outlier treatment is required for either transaction_qty or unit_price, ensuring that the dataset remains representative of normal business operations.
# 95% confidence interval for the mean of `x`, formatted "lower - upper".
# Returns the string "NA" when fewer than 3 non-missing values are available,
# matching the character-typed output of the other descriptive statistics.
ci_95 <- function(x) {
  n <- sum(!is.na(x))
  if (n < 3) return("NA")
  se <- sd(x, na.rm = TRUE) / sqrt(n)
  mean_x <- mean(x, na.rm = TRUE)
  # Use the t quantile rather than the normal 1.96: exact for small n,
  # and qt(0.975, n - 1) converges to 1.96 for large n, so results for
  # this dataset (n ~ 149k) are unchanged after rounding.
  t_crit <- qt(0.975, df = n - 1)
  ci <- c(mean_x - t_crit * se, mean_x + t_crit * se)
  paste0(round(ci[1], 2), " - ", round(ci[2], 2))
}
# Named list of summary statistics for use with summarise(across(...)).
# Every entry returns a character scalar so all results can share one table
# column. The leading "_" in each name combines with across()'s default
# "{col}_{fn}" naming to produce "column__stat", which pivot_longer() later
# splits on the "__" separator.
# Fix: scalar conditions use if/else instead of ifelse() — ifelse() is a
# vectorised function and is the wrong tool for a length-1 test.
statistics <- list(
  `_Number of values` = ~as.character(sum(!is.na(.x))),
  `_No data` = ~as.character(sum(is.na(.x))),
  `_Mean` = ~if (sum(!is.na(.x)) == 0) "NA" else as.character(round(mean(.x, na.rm = TRUE), 2)),
  `_Median` = ~if (sum(!is.na(.x)) == 0) "NA" else as.character(round(median(.x, na.rm = TRUE), 2)),
  # SD needs at least a few observations to be meaningful
  `_SD` = ~if (sum(!is.na(.x)) < 3) "NA" else as.character(round(sd(.x, na.rm = TRUE), 2)),
  `_Q1 - Q3` = ~if (sum(!is.na(.x)) == 0) "NA" else paste0(round(quantile(.x, 0.25, na.rm = TRUE), 2), " - ", round(quantile(.x, 0.75, na.rm = TRUE), 2)),
  `_IQR` = ~if (sum(!is.na(.x)) == 0) "NA" else as.character(round(IQR(.x, na.rm = TRUE), 2)),
  `_95% CI` = ~ci_95(.x),
  `_min` = ~if (sum(!is.na(.x)) == 0) "NA" else as.character(round(min(.x, na.rm = TRUE), 2)),
  `_max` = ~if (sum(!is.na(.x)) == 0) "NA" else as.character(round(max(.x, na.rm = TRUE), 2))
)
# Summarise the descriptive statistics for each numeric variable and render
# them as a zebra-striped flextable, one row per variable/statistic pair.
# (Previous comment mentioned "TenYearCHD", which does not exist in this
# dataset — a leftover from another project; there is no grouping here.)
data_sales %>%
select(where(is.numeric)) %>%
summarise(across(everything(), statistics)) %>%
# across() names results "<column>_<stat>"; since every stat name starts
# with "_", splitting on "__" recovers the variable and statistic parts
pivot_longer(cols = everything(), names_sep = "__", names_to = c("Variable", "Stat")) %>%
rename(`Value` = value) %>%
flextable() %>%
theme_zebra() %>%
merge_v("Variable") %>%
autofit()
Variable | Stat | Value |
|---|---|---|
transaction_qty | Number of values | 149116 |
No data | 0 | |
Mean | 1.44 | |
Median | 1 | |
SD | 0.54 | |
Q1 - Q3 | 1 - 2 | |
IQR | 1 | |
95% CI | 1.44 - 1.44 | |
min | 1 | |
max | 8 | |
unit_price | Number of values | 149116 |
No data | 0 | |
Mean | 3.38 | |
Median | 3 | |
SD | 2.66 | |
Q1 - Q3 | 2.5 - 3.75 | |
IQR | 1.25 | |
95% CI | 3.37 - 3.4 | |
min | 0.8 | |
max | 45 |
# Frequency table for every factor variable (IDs and the 80-level
# product_detail are dropped to keep the table readable).
data_sales %>%
select(-transaction_id, -product_id, -product_detail) %>%
select(where(is.factor)) %>%
# Stack all factor columns into Variable/Value pairs, then count each level
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value") %>%
group_by(Variable, Value) %>%
summarise(n = n(), .groups = 'drop') %>%
group_by(Variable) %>%
# `No data`: number of NA levels per variable (an NA level would appear as
# its own Value row after the count above); `% by group`: share within the
# variable, formatted as text
mutate(`No data` = sum(is.na(Value)),
`% by group` = paste0(round((n / sum(n)) * 100, 2), " %") ) %>%
ungroup() %>%
select(Variable, Value, n, `% by group`, `No data`) %>%
arrange(Variable, Value) %>%
flextable() %>%
theme_zebra() %>%
merge_v("Variable") %>%
autofit()
Variable | Value | n | % by group | No data |
|---|---|---|---|---|
product_category | Bakery | 22,796 | 15.29 % | 0 |
Branded | 747 | 0.5 % | 0 | |
Coffee | 58,416 | 39.17 % | 0 | |
Coffee beans | 1,753 | 1.18 % | 0 | |
Drinking Chocolate | 11,468 | 7.69 % | 0 | |
Flavours | 6,790 | 4.55 % | 0 | |
Loose Tea | 1,210 | 0.81 % | 0 | |
Packaged Chocolate | 487 | 0.33 % | 0 | |
Tea | 45,449 | 30.48 % | 0 | |
product_type | Drinking Chocolate | 266 | 0.18 % | 0 |
Barista Espresso | 16,403 | 11 % | 0 | |
Biscotti | 5,711 | 3.83 % | 0 | |
Black tea | 303 | 0.2 % | 0 | |
Brewed Black tea | 11,350 | 7.61 % | 0 | |
Brewed Chai tea | 17,183 | 11.52 % | 0 | |
Brewed Green tea | 5,671 | 3.8 % | 0 | |
Brewed herbal tea | 11,245 | 7.54 % | 0 | |
Chai tea | 443 | 0.3 % | 0 | |
Clothing | 221 | 0.15 % | 0 | |
Drip coffee | 8,477 | 5.68 % | 0 | |
Espresso Beans | 319 | 0.21 % | 0 | |
Gourmet Beans | 366 | 0.25 % | 0 | |
Gourmet brewed coffee | 16,912 | 11.34 % | 0 | |
Green beans | 134 | 0.09 % | 0 | |
Green tea | 159 | 0.11 % | 0 | |
Herbal tea | 305 | 0.2 % | 0 | |
Hot chocolate | 11,468 | 7.69 % | 0 | |
House blend Beans | 183 | 0.12 % | 0 | |
Housewares | 526 | 0.35 % | 0 | |
Organic Beans | 415 | 0.28 % | 0 | |
Organic brewed coffee | 8,489 | 5.69 % | 0 | |
Organic Chocolate | 221 | 0.15 % | 0 | |
Pastry | 6,912 | 4.64 % | 0 | |
Premium Beans | 336 | 0.23 % | 0 | |
Premium brewed coffee | 8,135 | 5.46 % | 0 | |
Regular syrup | 4,979 | 3.34 % | 0 | |
Scone | 10,173 | 6.82 % | 0 | |
Sugar free syrup | 1,811 | 1.21 % | 0 | |
store_id | 3 | 50,599 | 33.93 % | 0 |
5 | 47,782 | 32.04 % | 0 | |
8 | 50,735 | 34.02 % | 0 | |
store_location | Astoria | 50,599 | 33.93 % | 0 |
Hell's Kitchen | 50,735 | 34.02 % | 0 | |
Lower Manhattan | 47,782 | 32.04 % | 0 |
# Aggregate sales per product category: units sold and revenue,
# ordered from the highest- to the lowest-earning category.
sales_by_category <- data_sales %>%
  select(product_category, transaction_qty, unit_price) %>%
  # Revenue of each line item, summed per category below
  mutate(line_revenue = transaction_qty * unit_price) %>%
  group_by(product_category) %>%
  summarise(
    total_sales_qty = sum(transaction_qty, na.rm = TRUE),
    total_sales_value = sum(line_revenue, na.rm = TRUE)
  ) %>%
  arrange(desc(total_sales_value))
# Bar plot of one sales metric per product category.
#
# data          : data frame containing a product_category column
# y_var         : string name of the numeric column to plot
# title, y_label: plot title and y-axis label
# y_breaks_by   : spacing of the y-axis breaks
# label_prefix  : text prepended to each bar label (e.g. "$" or "n = ")
#
# Fixes: .data[[y_var]] is the supported tidy-eval way to reference a column
# by its string name inside aes() (get() depends on the evaluation
# environment); geom_col() replaces geom_bar(stat = "identity"); paste0()
# replaces paste(..., sep = "").
plot_sales_categories <- function(data, y_var, title, y_label, y_breaks_by, label_prefix = "") {
  ggplot(data, aes(x = reorder(product_category, -.data[[y_var]]),
                   y = .data[[y_var]],
                   fill = product_category)) +
    geom_col() +
    geom_label(aes(label = paste0(label_prefix, round(.data[[y_var]]))),
               fill = "white",
               size = 3,
               fontface = "bold",
               label.size = 0.3,
               vjust = -0.2,
               position = position_dodge(width = 0.9)) +
    # Leave 20% headroom above the tallest bar so labels are not clipped
    scale_y_continuous(labels = comma,
                       breaks = seq(0, max(data[[y_var]]) * 1.2, by = y_breaks_by),
                       limits = c(0, max(data[[y_var]]) * 1.2)) +
    scale_fill_manual(values = merged_palette) +
    labs(title = title,
         x = "Product Category",
         y = y_label) +
    theme_custom +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = "none")
}
# Bar chart: total units sold per product category ("n = " labels)
plot_sales_categories(sales_by_category,
y_var = "total_sales_qty",
title = "Total Sales Quantity by Product Category",
y_label = "Total Sales Quantity",
y_breaks_by = 10000,
label_prefix = "n = ")
# Bar chart: total revenue per product category (dollar-prefixed labels)
plot_sales_categories(sales_by_category,
y_var = "total_sales_value",
title = "Total Sales Income by Product Category",
y_label = "Total Sales Income",
y_breaks_by = 25000,
label_prefix = "$")
# Average revenue per transaction row for each product category
# (total revenue divided by the number of rows in the category)
avg_sales_per_transaction_category <- data_sales %>%
group_by(product_category) %>%
summarise(
avg_sales_per_transaction = sum(transaction_qty * unit_price, na.rm = TRUE) / n()
) %>%
arrange(desc(avg_sales_per_transaction))
# Horizontal bar chart of average sales per transaction, labelled in dollars
ggplot(avg_sales_per_transaction_category, aes(x = reorder(product_category, avg_sales_per_transaction), y = avg_sales_per_transaction, fill = product_category)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_manual(values = merged_palette) +
labs(title = "Average Sales Per Transaction by Product Category",
x = "Product Category",
y = "Average Sales Per Transaction ($)") +
geom_label(aes(label = paste0("$", comma(round(avg_sales_per_transaction, 2)))),
position = position_stack(vjust = 0.5),
size = 3, fill = "white") +
theme_custom
Coffee beans and Branded products may be higher-priced items, leading to higher per-transaction sales even though they have fewer units sold. Coffee and Tea are likely lower-priced items but are sold in much larger quantities, contributing more significantly to the overall revenue, even though their average sales per transaction are lower.
Key Observations:
Coffee is the top-performing product category in both quantity and value. It has the highest number of units sold and also generates the most revenue.
Tea is the second most popular category, both in terms of the number of units sold and total revenue, though its sales figures are significantly lower than Coffee.
Bakery and Drinking Chocolate show moderate sales in both quantity and value, ranking third and fourth in both metrics.
There’s a clear concentration of sales in a few major categories (Coffee, Tea, and Bakery), with the other categories contributing much smaller amounts.
There are 29 distinct product types in the dataset. To streamline the analysis, I chose to focus on the product types that belong to the top-selling product categories. These top categories were identified as those contributing at least 10% of the total revenue. This threshold is flexible and can be adjusted as needed.
# Total revenue across every category
total_sales_income <- sum(sales_by_category$total_sales_value, na.rm = TRUE)
# Categories whose revenue is at least 10% of the total (base-R subsetting
# of the already-aggregated category table)
significant_categories <- sales_by_category$product_category[
  sales_by_category$total_sales_value >= 0.1 * total_sales_income
]
# Units, revenue and transaction counts per product type, restricted to the
# significant (>= 10% of total revenue) categories.
sales_by_type_significant <- data_sales %>%
  filter(product_category %in% significant_categories) %>%
  group_by(product_category, product_type) %>%
  summarise(
    total_sales_qty = sum(transaction_qty, na.rm = TRUE),
    total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE),
    count_transactions = n(),  # Count of transactions for each product type
    # Return an ungrouped tibble: silences the "has grouped output" console
    # message and avoids surprising grouped behaviour downstream
    .groups = "drop"
  ) %>%
  arrange(desc(total_sales_value))
## `summarise()` has grouped output by 'product_category'. You can override using
## the `.groups` argument.
# Dodged, flipped bar plot of one sales metric per category and sub-type.
#
# data        : data frame with the plotted columns
# x_var       : string name of the category column (x axis, flipped)
# y_var       : string name of the numeric metric column
# fill_var    : string name of the sub-type column used for fill colours
# other args  : as in the single-variable version above
#
# NOTE(review): this intentionally redefines the earlier, simpler
# plot_sales_categories() with an extended signature; all later calls use
# this version. A distinct name would avoid the shadowing.
# Fixes: .data[[var]] replaces get(var) in aes() (supported tidy-eval);
# geom_col() replaces geom_bar(stat = "identity"); paste0() replaces
# paste(..., sep = "").
plot_sales_categories <- function(data, x_var, y_var, fill_var, title, y_label, y_breaks_by, label_prefix = "") {
  ggplot(data, aes(x = reorder(.data[[x_var]], -.data[[y_var]]),
                   y = .data[[y_var]],
                   fill = .data[[fill_var]])) +
    geom_col(position = position_dodge2(width = 0.9)) +
    # Value labels drawn at half bar height, dodged like the bars
    geom_text(aes(label = paste0(label_prefix, round(.data[[y_var]])),
                  y = .data[[y_var]] / 2),
              size = 3,
              fontface = "bold",
              colour = "white",
              position = position_dodge2(width = 0.9),
              vjust = 0.5) +
    scale_y_continuous(labels = comma,
                       breaks = seq(0, max(data[[y_var]]) * 1.2, by = y_breaks_by),
                       limits = c(0, max(data[[y_var]]) * 1.2)) +
    scale_fill_manual(values = merged_palette) +
    labs(title = title,
         x = "Product Category",
         y = y_label,
         fill = "Product Type") +
    coord_flip() +
    theme_custom +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = "right")
}
# Dodged bars: units sold per product type within the significant categories
plot_sales_categories(sales_by_type_significant,
x_var = "product_category",
y_var = "total_sales_qty",
fill_var = "product_type",
title = "Total Sales Quantity by Product Category and Product Type",
y_label = "Total Sales Quantity",
y_breaks_by = 2500,
label_prefix = "n = ")
# Dodged bars: revenue per product type within the significant categories
plot_sales_categories(sales_by_type_significant,
x_var = "product_category",
y_var = "total_sales_value",
fill_var = "product_type",
title = "Total Sales Value by Product Category and Product Type",
y_label = "Total Sales Value",
y_breaks_by = 10000,
label_prefix = "$")
The analysis of sales by product type of the best selling categories shows:
The top three best-selling product types were Brewed Chai Tea (sold 26,250 times), Gourmet Brewed Coffee (sold 25,973 times), and Barista Espresso (sold 24,943 times). The top three product types generating the highest profit were Barista Espresso ($91,406), Brewed Chai Tea ($77,082), and Hot Chocolate ($72,416 from 17,457 sales).
It is important to note that the best-selling product type did not generate the highest profit, although it is still among the top three product types generating the highest profit. Let’s confirm that there is a correlation between the number of products sold and total sales value.
# Scatter of units sold vs revenue per product type, with a linear trend line
ggplot(sales_by_type_significant, aes(x = total_sales_qty, y = total_sales_value)) +
  geom_point(aes(color = product_type), size = 4, alpha = 0.6) +
  # linewidth replaces the line `size` aesthetic deprecated in ggplot2 3.4.0
  # (this deprecation is exactly what triggered the warning in the rendered
  # output)
  geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 0.5) +
  scale_color_manual(values = merged_palette) +
  labs(title = "Correlation Between Units Sold and Total Profit by Product Type",
       x = "Units Sold",
       y = "Total Sales Value ($)",
       color = "Product Type") +
  theme_custom +
  theme(
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6),
    legend.key.size = unit(0.1, "cm")
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
The dark red line represents the trend line (fitted with linear regression), showing a general positive relationship between units sold and total sales value. This means that, as the number of units sold increases, total sales value generally increases as well. The correlation between sales and profit is positive, but the relationship isn’t perfect—some products deviate from the trend. No product types were discovered that had low sales but high profit, indicating that products that generate significant revenue generally also have high sales volumes.
Ensuring that all stores are regularly stocked with these high-performing products is crucial, as they play a significant role in both sales volume and revenue.
There are 80 unique product IDs in the dataset, but for this analysis, I concentrated on the Top 10 items that contribute the most significant portion of the total profit or were best-sellers.
# Total units and revenue per individual product (ID plus its detail label)
sales_profit <- data_sales %>%
  group_by(product_id, product_detail) %>%
  summarise(
    total_sales_qty = sum(transaction_qty, na.rm = TRUE),
    total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE),
    # Return an ungrouped tibble directly: replaces the separate ungroup()
    # step and silences the "has grouped output" console message
    .groups = "drop"
  ) %>%
  arrange(desc(total_sales_value))  # highest revenue first
## `summarise()` has grouped output by 'product_id'. You can override using the
## `.groups` argument.
# Select the top 10 items by total revenue.
# BUG FIX: this result was previously assigned to `top_10_products_sales`
# and then immediately overwritten by the quantity-based selection below, so
# the "Top 10 Products by Total Profit" chart actually displayed the
# quantity-based top 10. The two selections now live in separate objects.
top_10_products_by_value <- sales_profit %>%
  slice_max(order_by = total_sales_value, n = 10)
# Select the top 10 items by quantity sold (original name kept so the final
# state of `top_10_products_sales` matches the original script)
top_10_products_sales <- sales_profit %>%
  arrange(desc(total_sales_qty)) %>%
  slice(1:10)
# Horizontal bars: the 10 highest-revenue products, labelled in dollars
ggplot(top_10_products_by_value, aes(x = reorder(product_id, total_sales_value), y = total_sales_value, fill = product_detail)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_text(aes(label = paste("$", round(total_sales_value))),
            size = 3,
            fontface = "bold",
            colour = "white",
            position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = merged_palette) +
  labs(title = "Top 10 Products by Total Profit",
       x = "Product ID",
       y = "Total Sales Value ($)",
       fill = "Product Detail") +
  theme_custom +
  theme(
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 8),
    legend.key.size = unit(0.3, "cm")
  )
# Horizontal bars: the 10 best-selling products by units sold
ggplot(top_10_products_sales, aes(x = reorder(product_id, total_sales_qty), y = total_sales_qty, fill = product_detail)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_text(aes(label = paste("n = ", round(total_sales_qty))),
            size = 3,
            fontface = "bold",
            colour = "white",
            position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = merged_palette) +
  labs(title = "Top 10 Products by Sales Quantity",
       x = "Product ID",
       y = "Total Quantity",
       fill = "Product Detail") +
  theme_custom +
  theme(
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 8),
    legend.key.size = unit(0.3, "cm")
  )
Since the previous analysis didn’t provide sufficient insights, I decided to focus on identifying which drink sizes were the best sellers within the coffee, tea, and drinking chocolate categories.
# Derive the drink size from the product_detail label ("Sm"/"Rg"/"Lg"
# abbreviations) for the three drink categories, then aggregate units and
# revenue per category and size.
sales_by_size <- data_sales %>%
  filter(product_category %in% c("Coffee", "Tea", "Drinking Chocolate")) %>%
  mutate(
    drink_size = case_when(
      grepl("Sm", product_detail) ~ "Small",
      grepl("Rg", product_detail) ~ "Regular",
      grepl("Lg", product_detail) ~ "Large",
      TRUE ~ "Unknown"  # details without a size abbreviation
    )
  ) %>%
  group_by(product_category, drink_size) %>%
  summarise(total_sales_qty = sum(transaction_qty, na.rm = TRUE),
            total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE),
            # Ungrouped output: silences the console message about grouping
            .groups = "drop") %>%
  arrange(desc(total_sales_qty))
## `summarise()` has grouped output by 'product_category'. You can override using
## the `.groups` argument.
# Dodged bars: units sold per drink size within each drink category
ggplot(sales_by_size, aes(x = product_category, y = total_sales_qty, fill = drink_size)) +
geom_bar(stat = "identity", position = position_dodge2(width = 0.9)) +
geom_label(aes(label = paste("n=", round(total_sales_qty), sep = ""), group = drink_size),
fill = "white",
size = 2,
fontface = "bold",
label.size = 0.2,
vjust = -0.5,
position = position_dodge2(width = 0.9)) +
scale_fill_manual(values = palette2) +
# 20% headroom keeps the labels above the bars inside the panel
scale_y_continuous(labels = comma,
breaks = seq(0, max(sales_by_size$total_sales_qty) * 1.2, by = 5000),
limits = c(0, max(sales_by_size$total_sales_qty) * 1.2)) +
labs(title = "Best-Selling Drink Sizes by Category",
x = "Product Category",
y = "Total Sales Quantity",
fill = "Drink Size") +
theme_custom
# Dodged bars: revenue per drink size within each drink category
ggplot(sales_by_size, aes(x = product_category, y = total_sales_value, fill = drink_size)) +
geom_bar(stat = "identity", position = position_dodge2(width = 0.9)) +
geom_label(aes(label = paste("$", round(total_sales_value), sep = ""), group = drink_size),
fill = "white",
size = 2,
fontface = "bold",
label.size = 0.2,
vjust = -0.5,
position = position_dodge2(width = 0.9)) +
scale_fill_manual(values = palette2) +
# 20% headroom keeps the labels above the bars inside the panel
scale_y_continuous(labels = comma,
breaks = seq(0, max(sales_by_size$total_sales_value) * 1.2, by = 25000),
limits = c(0, max(sales_by_size$total_sales_value) * 1.2)) +
labs(title = "Total Sales Value by Drink Size and Category",
x = "Product Category",
y = "Total Sales Value ($)",
fill = "Drink Size") +
theme_custom
This visualization helps highlight which drink sizes are most popular in each product category, with Large and Regular sizes standing out across all categories.
These insights suggest that stocking and promoting Large and Regular drinks should be a priority for maximizing both sales volume and profit in these categories. Efforts should be focused on ensuring these sizes are always available, especially in the Tea and Drinking Chocolate categories.
# Units and revenue per store, reshaped to long format so both metrics can
# be shown side by side in one faceted plot
sales_by_store <- data_sales %>%
  group_by(store_location, store_id) %>%
  summarise(
    total_sales_qty = sum(transaction_qty, na.rm = TRUE),
    total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE),
    # Ungrouped output: silences the console message about grouping
    .groups = "drop"
  ) %>%
  arrange(desc(total_sales_value)) %>%
  pivot_longer(cols = c(total_sales_qty, total_sales_value),
               names_to = "sales_metric",
               values_to = "sales_value")
## `summarise()` has grouped output by 'store_location'. You can override using
## the `.groups` argument.
# Faceted bars: one panel per metric (quantity / value), one bar per store.
# Note: the break spacing (50000) is shared by both facets even though the
# scales are free.
ggplot(sales_by_store, aes(x = reorder(store_location, sales_value), y = sales_value, fill = store_location)) +
geom_bar(stat = "identity", position = position_dodge2(width = 0.9)) +
geom_label(aes(label = comma(round(sales_value))),
fill = "white",
position = position_dodge2(width = 0.9),
vjust = -0.3,
size = 3,
fontface = "bold") +
scale_fill_manual(values = palette2) +
scale_y_continuous(labels = comma,
breaks = seq(0, max(sales_by_store$sales_value) * 1.2, by = 50000),
limits = c(0, max(sales_by_store$sales_value) * 1.2)) +
facet_wrap(~ sales_metric, scales = "free") +
labs(title = "Total Sales Quantity and Value by Store Location",
x = "Store Location",
y = "Sales Value or Quantity",
fill = "Store Location") +
theme_custom
# Units and revenue per product category within each store
sales_by_category_stores <- data_sales %>%
  group_by(store_location, product_category) %>%
  summarise(
    total_sales_qty = sum(transaction_qty, na.rm = TRUE),
    total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE),
    # Ungrouped output: silences the console message about grouping
    .groups = "drop"
  ) %>%
  arrange(store_location, desc(total_sales_value))
## `summarise()` has grouped output by 'store_location'. You can override using
## the `.groups` argument.
# Faceted bars: revenue per product category, one panel per store
ggplot(sales_by_category_stores, aes(x = reorder(product_category, total_sales_value), y = total_sales_value, fill = product_category)) +
geom_bar(stat = "identity", position = "dodge2") +
facet_wrap(~ store_location) +
scale_fill_manual(values = merged_palette) +
scale_y_continuous(labels = comma,
breaks = seq(0, max(sales_by_category_stores$total_sales_value) * 1.2, by = 10000)) +
labs(title = "Sales by Product Category and Store Location (Profit)",
x = "Product Category",
y = "Total Sales Value ($)",
fill = "Product Category") +
theme_custom +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Faceted bars: units sold per product category, one panel per store
ggplot(sales_by_category_stores, aes(x = reorder(product_category, total_sales_qty), y = total_sales_qty, fill = product_category)) +
  geom_bar(stat = "identity", position = "dodge2") +
  facet_wrap(~ store_location) +
  scale_fill_manual(values = merged_palette) +
  scale_y_continuous(labels = comma,
                     breaks = seq(0, max(sales_by_category_stores$total_sales_qty) * 1.2, by = 5000)) +
  labs(title = "Sales by Product Category and Store Location (Sold Items)",
       x = "Product Category",
       # BUG FIX: the axis label previously said "Total Sales Value ($)"
       # although this chart plots quantity, not revenue
       y = "Total Sales Quantity",
       fill = "Product Category") +
  theme_custom +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Average revenue per transaction row for each store: compute each line
# item's revenue first, then divide the store total by its row count.
avg_sales_per_transaction_store <- data_sales %>%
  mutate(line_revenue = transaction_qty * unit_price) %>%
  group_by(store_location) %>%
  summarise(
    avg_sales_per_transaction = sum(line_revenue, na.rm = TRUE) / n()
  ) %>%
  arrange(desc(avg_sales_per_transaction))
# Lollipop chart: average sales per transaction, one stem + point per store.
# The larger transparent point behind the solid one creates a halo effect.
ggplot(avg_sales_per_transaction_store, aes(x = reorder(store_location, avg_sales_per_transaction), y = avg_sales_per_transaction)) +
geom_segment(aes(x = reorder(store_location, avg_sales_per_transaction), xend = reorder(store_location, avg_sales_per_transaction),
y = 0, yend = avg_sales_per_transaction), color = "grey30") +
geom_point(aes(color = store_location), size = 10, alpha = 0.2) +
geom_point(aes(color = store_location), size = 6) +
geom_label(aes(label = comma(round(avg_sales_per_transaction, 2))),
vjust = -0.5,
size = 4, fill = "white") +
scale_color_manual(values = palette1) +
scale_y_continuous(labels = comma,
breaks = seq(0, max(avg_sales_per_transaction_store$avg_sales_per_transaction) * 1.2, by = 0.5),
limits = c(0, max(avg_sales_per_transaction_store$avg_sales_per_transaction) * 1.2) ) +
labs(title = "Average Sales per Transaction by Store Location",
x = "Store Location",
y = "Average Sales per Transaction ($)",
color = "Store Location") +
theme_custom
There is little variation in the quantity of products sold, total profit, and sales across different product categories among the three stores. Similarly, the average sales per transaction are quite consistent across the stores, indicating that customer spending patterns are generally similar.
# Monthly units and revenue per product category. `month` (a date, first of
# the month) keeps the chronological order; `month_year` is its readable
# "Jan-2023" label.
sales_by_month_category <- data_sales %>%
  mutate(
    month = floor_date(transaction_date_and_time, "month"),
    month_year = format(month, "%b-%Y")
  ) %>%
  group_by(month, month_year, product_category) %>%
  summarise(
    total_sales_qty = sum(transaction_qty, na.rm = TRUE),
    total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE),
    # Ungrouped output: replaces the separate ungroup() call and silences
    # the console message about grouping
    .groups = "drop"
  )
## `summarise()` has grouped output by 'month', 'month_year'. You can override
## using the `.groups` argument.
# Monthly revenue trend per category; reorder() keeps the month labels in
# chronological (not alphabetical) order on the x axis
ggplot(sales_by_month_category, aes(x = reorder(month_year, month), y = total_sales_value, color = product_category, group = product_category)) +
  # linewidth replaces the line `size` aesthetic deprecated in ggplot2 3.4.0
  geom_line(linewidth = 0.5) +
  geom_point(size = 1) +
  scale_color_manual(values = merged_palette) +
  labs(title = "Sales Trend Over Time by Product Category",
       x = "Month-Year",
       y = "Total Sales Value ($)",
       color = "Product Category") +
  theme_custom +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Weekly units and revenue per product category; `week_year` is a
# "weeknumber-year" label formatted from the week start date
sales_by_week_category <- data_sales %>%
  mutate(
    week = floor_date(transaction_date_and_time, "week"),
    week_year = format(week, "%U-%Y")
  ) %>%
  group_by(week, week_year, product_category) %>%
  summarise(
    total_sales_qty = sum(transaction_qty, na.rm = TRUE),
    total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE),
    # Ungrouped output: replaces the separate ungroup() call and silences
    # the console message about grouping
    .groups = "drop"
  )
## `summarise()` has grouped output by 'week', 'week_year'. You can override using
## the `.groups` argument.
# Weekly revenue trend per category
ggplot(sales_by_week_category, aes(x = week_year, y = total_sales_value, color = product_category, group = product_category)) +
  # linewidth replaces the line `size` aesthetic deprecated in ggplot2 3.4.0
  geom_line(linewidth = 0.5) +
  geom_point(size = 1) +
  scale_color_manual(values = merged_palette) +
  labs(title = "Sales Trend Over Time by Product Category (Weekly)",
       x = "Week-Year",
       y = "Total Sales Value ($)",
       color = "Product Category") +
  theme_custom +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
The graph shows that sales of Coffee and Tea products increase significantly from February to June. Sales in other product categories, such as Bakery, Drinking Chocolate, and Coffee beans, also rise over this period, though less sharply than Coffee and Tea. Other categories also experience growth, but more gradually.
The demand for Coffee and Tea remained steady in January, then started to rise from February through June. This pattern suggests that colder weather might have sustained demand early in the year, while warmer spring months encouraged even more consumption. Meanwhile, categories like Flavours and Packaged Chocolate maintained steady, year-round demand with minimal seasonal impact.
To confirm these patterns, analyzing data over multiple years or incorporating weather data could help determine if these trends repeat each year.
# Units and revenue per day of the week.
# NOTE(review): despite its name, transaction_count holds the summed
# transaction_qty (units sold), not the number of transactions (n()) —
# confirm the intended metric with the author.
sales_by_day_of_week <- data_sales %>%
mutate(day_of_week = wday(transaction_date_and_time, label = TRUE, abbr = TRUE)) %>%
group_by(day_of_week) %>%
summarise(
transaction_count = sum(transaction_qty, na.rm = TRUE),
total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE)
)
# Scaling factor that maps sales value onto the count axis so both series
# can share one plot with a secondary axis
scaling_factor <- max(sales_by_day_of_week$total_sales_value) / max(sales_by_day_of_week$transaction_count)
# Combined chart: bars = units sold per weekday, dashed line = total revenue
# rescaled onto the bar axis (shown on the secondary axis)
ggplot(sales_by_day_of_week, aes(x = day_of_week)) +
  geom_bar(aes(y = transaction_count, fill = day_of_week), stat = "identity", alpha = 0.7) +
  # linewidth replaces the line `size` aesthetic deprecated in ggplot2 3.4.0
  geom_line(aes(y = total_sales_value / scaling_factor, group = 1),
            color = "darkblue", linewidth = 1, linetype = "dashed") +
  geom_point(aes(y = total_sales_value / scaling_factor),
             color = "darkblue", size = 3) +
  geom_text(aes(y = total_sales_value / scaling_factor,
                label = paste0("$", comma(round(total_sales_value, 2)))),
            vjust = -1, hjust = 0.5, color = "darkblue", size = 3) +
  scale_y_continuous(
    name = "Transaction Count",
    # Invert the scaling so the secondary axis shows real dollar values
    sec.axis = sec_axis(~ . * scaling_factor, name = "Total Sales Value ($)"),
    limits = c(0, max(sales_by_day_of_week$transaction_count) * 1.2)
  ) +
  scale_fill_manual(values = merged_palette) +
  labs(title = "Frequency of Transactions and Total Sales Income by Day of the Week",
       x = "Day of the Week") +
  theme_custom +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none")
Product sales and revenue stayed steady across the week, with a small drop on weekends.
# Define time periods for time of day
# The hour is computed once (instead of three times) and dropped afterwards.
# Transactions outside 06:00-21:00 are deliberately mapped to NA — there is
# no "Night" category in this data set (assumed; confirm against raw hours).
data_sales <- data_sales %>%
  mutate(
    txn_hour = hour(transaction_date_and_time),
    time_of_day = case_when(
      txn_hour >= 6 & txn_hour < 12 ~ "Morning",
      txn_hour >= 12 & txn_hour < 17 ~ "Afternoon",
      txn_hour >= 17 & txn_hour < 21 ~ "Evening",
      TRUE ~ NA_character_  # explicit fallthrough (same as implicit default)
    ),
    # Fixed level order so plots show Morning -> Afternoon -> Evening
    time_of_day = factor(time_of_day, levels = c("Morning", "Afternoon", "Evening"))
  ) %>%
  select(-txn_hour)
# Summarize total sales quantity by time of day and product category
# `.groups = "drop"` returns an ungrouped result directly, replacing the
# trailing ungroup() and silencing the "grouped output" message.
sales_by_time_of_day <- data_sales %>%
  group_by(time_of_day, product_category) %>%
  summarise(
    total_sales_qty = sum(transaction_qty, na.rm = TRUE),
    .groups = "drop"
  )
# Total revenue per time of day, across every product category.
# Revenue for each transaction line is quantity x unit price.
total_sales_value_by_time <- data_sales %>%
  mutate(line_revenue = transaction_qty * unit_price) %>%
  group_by(time_of_day) %>%
  summarise(total_sales_value = sum(line_revenue, na.rm = TRUE)) %>%
  ungroup()
# Ratio that projects dollar values onto the quantity axis of the combo chart
scaling_factor <- max(total_sales_value_by_time$total_sales_value) /
  max(sales_by_time_of_day$total_sales_qty)
# Plot the sales by time of day with a single line for total sales value.
# Stacked bars = units sold per category (left axis); dashed line/points/labels
# = total revenue rescaled onto the quantity axis (read off the right axis).
ggplot() +
  geom_col(data = sales_by_time_of_day,
           aes(x = time_of_day, y = total_sales_qty, fill = product_category),
           position = "stack", alpha = 0.7) +
  geom_line(data = total_sales_value_by_time,
            aes(x = time_of_day, y = total_sales_value / scaling_factor, group = 1),
            color = "darkblue", linewidth = 0.5, linetype = "dashed") +
  geom_point(data = total_sales_value_by_time,
             aes(x = time_of_day, y = total_sales_value / scaling_factor),
             color = "darkblue",
             size = 2) +
  geom_text(data = total_sales_value_by_time,
            aes(x = time_of_day, y = total_sales_value / scaling_factor,
                label = paste0("$", comma(round(total_sales_value, 2)))),
            vjust = -1.2,
            color = "darkblue",
            size = 3) +
  scale_y_continuous(name = "Total Products Sold",
                     # invert the scaling so the right axis reads in dollars
                     sec.axis = sec_axis(~ . * scaling_factor, name = "Total Sales Value ($)")) +
  scale_fill_manual(values = merged_palette) +
  labs(title = "Sales Comparison by Time of Day and Product Category",
       x = "Time of Day",
       y = "Total Products Sold",
       fill = "Product Category") +
  theme_custom +
  theme(legend.position = "right")
The analysis of sales by time of day reveals that Morning hours drive the highest product sales and revenue, reaching $338,289. Afternoon sales contribute a moderate $204,721, while Evening hours show the lowest revenue at $105,803.
This trend suggests that Morning hours are peak shopping times, making it essential to ensure optimal product availability and staffing during these hours to maximize sales potential.
# Monthly Sales Growth
monthly_sales <- data_sales %>%
  mutate(month = floor_date(transaction_date_and_time, "month")) %>%
  group_by(month) %>%
  summarise(total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE)) %>%
  arrange(month) %>%
  # % change vs the previous month; the first month has no predecessor -> NA
  mutate(monthly_growth = (total_sales_value - lag(total_sales_value)) / lag(total_sales_value) * 100)
# Scaling factor that maps growth rate (%) onto the sales-value ($) axis
monthly_growth_scale <- max(monthly_sales$total_sales_value, na.rm = TRUE) /
  max(monthly_sales$monthly_growth, na.rm = TRUE)
# Plot Monthly Sales and Growth Rate.
# na.rm = TRUE on the growth layers suppresses the expected "Removed 1 row"
# warnings caused by the leading NA growth value.
ggplot(monthly_sales, aes(x = month)) +
  geom_col(aes(y = total_sales_value), fill = "skyblue", alpha = 0.8) +
  geom_line(aes(y = monthly_growth * monthly_growth_scale),
            color = "darkred", linetype = "dashed", linewidth = 1, na.rm = TRUE) +
  geom_point(aes(y = monthly_growth * monthly_growth_scale),
             color = "darkred", size = 2, na.rm = TRUE) +
  scale_y_continuous(
    name = "Total Sales Value ($)",
    sec.axis = sec_axis(~ . / monthly_growth_scale, name = "Monthly Growth Rate (%)"),
    # BUG FIX: the lower limit must be expressed on the primary (scaled $)
    # axis and must ignore the leading NA; the original used the raw percent
    # value with no na.rm, which evaluates to NA. 0 is included so the bars
    # (which start at 0) are never clipped when all growth values are positive.
    limits = c(min(c(0, monthly_sales$monthly_growth * monthly_growth_scale), na.rm = TRUE),
               max(monthly_sales$total_sales_value) * 1.2)
  ) +
  geom_label(aes(
    y = monthly_growth * monthly_growth_scale,
    label = paste0(round(monthly_growth, 2), "%")
  ),
  vjust = -0.5,
  size = 3,
  na.rm = TRUE) +
  labs(title = "Monthly Sales Growth or Decline",
       x = "Month",
       y = "Total Sales Value") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Weekly Sales Growth
weekly_sales <- data_sales %>%
  mutate(week = floor_date(transaction_date_and_time, "week")) %>%
  group_by(week) %>%
  summarise(total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE)) %>%
  arrange(week) %>%
  # % change vs the previous week; the first week has no predecessor -> NA
  mutate(weekly_growth = (total_sales_value - lag(total_sales_value)) / lag(total_sales_value) * 100)
# Scaling factor that maps growth rate (%) onto the sales-value ($) axis
weekly_growth_scale <- max(weekly_sales$total_sales_value, na.rm = TRUE) /
  max(weekly_sales$weekly_growth, na.rm = TRUE)
# Plot Weekly Sales and Growth Rate.
# na.rm = TRUE on the growth layers suppresses the expected "Removed 1 row"
# warnings caused by the leading NA growth value.
ggplot(weekly_sales, aes(x = week)) +
  geom_col(aes(y = total_sales_value), fill = "pink", alpha = 0.8) +
  geom_line(aes(y = weekly_growth * weekly_growth_scale),
            color = "darkblue", linetype = "dashed", linewidth = 1, na.rm = TRUE) +
  geom_point(aes(y = weekly_growth * weekly_growth_scale),
             color = "darkblue", size = 2, na.rm = TRUE) +
  scale_y_continuous(
    name = "Total Sales Value ($)",
    sec.axis = sec_axis(~ . / weekly_growth_scale, name = "Weekly Growth Rate (%)"),
    # BUG FIX: the lower limit must be expressed on the primary (scaled $)
    # axis and must ignore the leading NA; the original used the raw percent
    # value with no na.rm, which evaluates to NA. 0 is included so the bars
    # (which start at 0) are never clipped when all growth values are positive.
    limits = c(min(c(0, weekly_sales$weekly_growth * weekly_growth_scale), na.rm = TRUE),
               max(weekly_sales$total_sales_value) * 1.2)
  ) +
  geom_label(aes(
    y = weekly_growth * weekly_growth_scale,
    label = paste0(round(weekly_growth, 2), "%")
  ),
  vjust = -0.2,
  size = 3,
  na.rm = TRUE) +
  labs(title = "Weekly Sales Growth or Decline",
       x = "Week",
       y = "Total Sales Value") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
The strongest sales growth occurs from March to May, signaling a potentially seasonal or trend-based increase in demand during this period. Although June maintains high sales, the growth rate begins to level off, suggesting a plateau.
The weekly growth rate shows high variability, with some weeks experiencing sharp growth while others show significant declines. This may indicate a pattern of inconsistent demand or varying promotional strategies.